# Chunk defaults for this document: show each chunk's source (echo = TRUE)
# but do not run any of it when the document is built (eval = FALSE).
knitr::opts_chunk$set(echo = TRUE, eval=FALSE)

Create Auto Clusters in R as a Data Frame

Download coding:

# Download the coding and make its tables available as objects.
# NOTE(review): the code below relies on `event_reports` and
# `processed_locations` existing after this call - presumably they are
# created by assign_coding_to_environment(); confirm.
evp_data <- get_coding()
assign_coding_to_environment(evp_data)

# Map the start and the end of every event report to an election. When the two
# ends map to different elections the report is labelled "ambiguous".
er <- event_reports %>%
  mutate(
    election_start_date = date_to_election(
      date(dmy(event_start)),
      election_start_col = "dissolution",
      election_end_col = "commencement"
    ),
    election_end_date = date_to_election(
      date(dmy(event_end)),
      election_start_col = "dissolution",
      election_end_col = "commencement"
    ),
    election = ifelse(
      election_start_date == election_end_date,
      election_start_date,
      "ambiguous"
    )
  )

# filter() keeps only rows where the condition is TRUE. Logical subsetting with
# `[` would additionally return an all-NA row for every report whose `election`
# is NA, and those phantom rows would then leak into the join below.
er06 <- filter(er, election == "1906")
location06 <- processed_locations %>%
  inner_join(er06, by = "event_report_id")

Standardise town/village names so that they can be geocoded:

#' Regularise town/village names into geocoder queries
#'
#' Collapses repeated whitespace, converts to title case and appends ", UK".
#'
#' @param x Character vector of town/village names.
#' @return A character vector the same length as `x`. Elements that are `NA`
#'   or blank (empty / whitespace only) are returned as `NA`, because a bare
#'   ", UK" query carries no place information.
reg_tv <- function(x) {
  cleaned <- stringr::str_to_title(stringr::str_squish(x))

  # Start from all-NA so that missing and blank names stay missing, and so that
  # zero-length input yields zero-length output (paste0() on an empty vector
  # would otherwise return the single string ", UK").
  out <- rep(NA_character_, length(cleaned))
  usable <- !is.na(cleaned) & nzchar(cleaned)
  if (any(usable)) {
    out[usable] <- paste0(cleaned[usable], ", UK")
  }

  out
}

# One row per distinct regularised place name; these are the strings that get
# sent to the geocoder. reg_tv() is vectorised, so it is applied to the whole
# column at once rather than once per group of raw names.
locationstocode <- location06 %>%
  mutate(towns_village_reg = reg_tv(towns_villages)) %>%
  group_by(towns_village_reg) %>%
  summarize() %>%
  drop_na()

Geocode unique locations

# Geocode each distinct place name once and keep the coordinates next to it.
locationscoded <- cbind(
  locationstocode,
  ggmap::geocode(locationstocode$towns_village_reg)
)

# Attach the coordinates to every location row through the regularised name.
# The join key is spelled out so that the join cannot silently start matching
# on any other column the two tables happen to share.
location06 <- location06 %>%
  mutate(towns_village_reg = reg_tv(towns_villages)) %>%
  left_join(locationscoded, by = "towns_village_reg")

# Split into rows that received coordinates and rows that did not.
hasloc06 <- drop_na(location06, lat, lon)
unspecificloc06 <- filter(location06, is.na(lat) | is.na(lon))

# Spatial version of the located rows (longitude/latitude, EPSG:4326).
hasloc06_sf <- st_as_sf(hasloc06, crs = 4326, coords = c("lon", "lat"))

# Pairwise distances between all located rows, expressed in kilometres.
kilodist <- sf::st_distance(hasloc06_sf) %>%
  units::set_units("kilometers") %>%
  as.dist()

# Hierarchical clustering on those distances (hclust() default linkage), cut
# at the chosen height in kilometres.
geoclusters <- hclust(kilodist)
threshold.in.kilometers <- 5

# The cluster labels come back in the row order of `hasloc06_sf`, so they are
# attached to a copy of that object. `locationcomp06` was previously assigned
# into without ever being created.
locationcomp06 <- hasloc06_sf
locationcomp06$geocluster <- cutree(geoclusters, h = threshold.in.kilometers)

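Subset to specifically identified locations (those whose town/village has a latitude and longitude) and cluster on these. The next step keeps only event reports with a specific date (start and end on the same day, with timeframe quantifier "during").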

# Keep only rows whose event is pinned to a single day: start and end parse to
# the same date and the timeframe quantifier is "during". `event_date` is that
# day, used later for clustering on dates.
loccompdatespec <- locationcomp06 %>%
  mutate(
    specific_date = date(dmy(event_start)) == date(dmy(event_end)) &
      event_timeframe_quantifier == "during",
    event_date = date(dmy(event_start))
  ) %>%
  filter(specific_date)

Within geographical locations cluster on dates:

# Drop the geometry column, then nest the rows so that there is one row per
# geographic cluster with that cluster's rows held in a list-column.
tc<-loccompdatespec %>%
  st_drop_geometry() %>%
  group_by(geocluster) %>%
  nest()


#' Cluster the rows of one geographic cluster by event date
#'
#' Runs hierarchical clustering (hclust() default linkage) on the number of
#' days between each row's `event_date` and the earliest one, then cuts the
#' tree at height `h`.
#'
#' @param x Data frame with an `event_date` column.
#' @param h Height, in days, at which the tree is cut.
#' @return `x` with an added integer column `withingeodatecluster`. Inputs with
#'   fewer than two rows cannot be clustered; any row present is labelled 1.
adddateclust <- function(x, h = 8) {
  if (nrow(x) < 2) {
    # Integer, to match what cutree() returns for the larger groups.
    x$withingeodatecluster <- rep(1L, nrow(x))
    return(x)
  }

  these_dates <- x[["event_date"]]
  # Convert to plain numeric days so that `h` is always measured in days,
  # independent of the units the date difference happens to carry.
  days_from_first <- as.numeric(these_dates - min(these_dates), units = "days")
  x$withingeodatecluster <- cutree(hclust(dist(days_from_first)), h = h)
  x
}
# Add a date cluster within every geographic cluster, flatten back to one row
# per location row, and give each (geocluster, withingeodatecluster) pair a
# single id. The result stays grouped by that pair.
# unnest() is told which list-column to expand and cur_group_id() replaces the
# argument-less group_indices(); both older spellings are deprecated.
tc2 <- tc %>%
  mutate(data = purrr::map(data, adddateclust)) %>%
  unnest(cols = c(data)) %>%
  group_by(geocluster, withingeodatecluster) %>%
  mutate(geodatecluster = cur_group_id())

Identify singletons (clusters of size 1):

# Rows of tc2 that belong to a cluster with exactly one member. The cluster
# sizes are computed first and then joined back onto the full table.
singletons <- inner_join(
  tc2 %>%
    group_by(geodatecluster) %>%
    tally() %>%
    filter(n == 1),
  tc2,
  by = "geodatecluster"
)

Autoclusters are non-singleton clusters

# Rows of tc2 that belong to a cluster with more than one member. The cluster
# sizes are computed first and then joined back onto the full table.
autoclusters <- inner_join(
  tc2 %>%
    group_by(geodatecluster) %>%
    tally() %>%
    filter(n > 1),
  tc2,
  by = "geodatecluster"
)

Still to do: deal with non-specific places by identifying the area in which the events took place (e.g. the county).

Verification Step: Assign Auto Clusters to User

Create autoclusters on database and store created autocluster_ids in a vector:

# NOTE(review): this passes `tc2`, which still contains the size-1 clusters,
# rather than `autoclusters` (the non-singleton subset) - confirm which one
# assign_clusters_from_df() is meant to receive.
autocluster_ids<-assign_clusters_from_df(tc2)

Assign newly created autocluster_ids to a user (e.g. user 3) for verification:

# The second argument (3) is the example user who will verify the autoclusters.
assign_autoclusters_to_user(autocluster_ids, 3)

Reallocation Step: Assign Singletons to Clusters

Find event reports of singletons (clusters of size 1 and events not in a cluster)

# Event reports that need reallocation: every event report with a processed
# location for this election that did not end up in a multi-member autocluster.
# That covers both clusters of size 1 and reports that were never clustered
# (no coordinates, or no specific date). setdiff() also removes duplicates.
# NOTE(review): a report with several locations is left out as soon as any one
# of them is in an autocluster - confirm that this is the intended rule.
singleton_event_report_ids <- setdiff(
  location06$event_report_id,
  autoclusters$event_report_id
)

Assign singletons to users for reallocation

# Hand the singleton event reports to user 3 (example user id) for reallocation.
assign_reallocations_to_user(event_report_ids = singleton_event_report_ids, user_id = 3)

Combination Step: Consider verified clusters for combination

Find verified clusters

# Distinct ids of the clusters that have already been verified.
verified_clusters <- get_verified_clusters()
verified_cluster_ids <- unique(verified_clusters$verified_cluster_id)

# Example batch: the 10th to 20th ids. Selecting by position with %in% returns
# only ids that exist, whereas `[10:20]` would pad the result with NA when
# fewer than 20 verified clusters are available.
some_verified_cluster_ids <-
  verified_cluster_ids[seq_along(verified_cluster_ids) %in% 10:20]

Assign verified (and combined) clusters to users for combination

# Hand the selected verified clusters to user 6 (example user id) for combination.
assign_verified_clusters_to_user(some_verified_cluster_ids, user_id = 6)


gidonc/durhamevp documentation built on April 8, 2022, 10:31 a.m.